Script to reproduce years based on a model trained with random points¶
Importing¶
In [ ]:
import xarray as xr
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import TransformedTargetRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.metrics import root_mean_squared_error as rmse
from tqdm import tqdm
import dill
import random
import salishsea_tools.viz_tools as sa_vi
Datasets Preparation¶
In [ ]:
def datasets_preparation(dataset, dataset2):
    """Build flattened driver/target arrays with NaN cells dropped.

    Parameters
    ----------
    dataset : holds the physics fields and the 'Diatom' target
    dataset2 : holds the external forcing fields (radiation, wind)

    Returns
    -------
    (drivers, diat, indx) where drivers is (6, n_valid), diat is
    (n_valid,), and indx is the np.where tuple of valid flat indices.
    """
    physics_vars = ['Temperature_(0m-15m)', 'Temperature_(15m-100m)',
                    'Salinity_(0m-15m)', 'Salinity_(15m-100m)']
    forcing_vars = ['Summation_of_solar_radiation', 'Mean_wind_speed']
    columns = [np.ravel(dataset[name]) for name in physics_vars]
    columns += [np.ravel(dataset2[name]) for name in forcing_vars]
    drivers = np.stack(columns)
    # keep only grid cells where every driver is finite
    indx = np.where(~np.isnan(drivers).any(axis=0))
    drivers = drivers[:, indx[0]]
    diat = np.ravel(dataset['Diatom'])[indx[0]]
    return (drivers, diat, indx)
Regressor¶
In [ ]:
def regressor(inputs, targets):
    """Train a bagged decision-tree regressor on 35% of the samples.

    Parameters
    ----------
    inputs : array of shape (n_features, n_samples); transposed here to
        the (n_samples, n_features) layout scikit-learn expects
    targets : 1-D array of target values, one per sample

    Returns the fitted BaggingRegressor.
    """
    features = inputs.transpose()
    # hold out 65% of the points; only the 35% split is used for fitting
    X_train, _, y_train, _ = train_test_split(features, targets, train_size=0.35)
    pipeline = make_pipeline(StandardScaler(), DecisionTreeRegressor())
    regr = BaggingRegressor(pipeline, n_estimators=12, n_jobs=4).fit(X_train, y_train)
    return (regr)
Regressor 2¶
In [ ]:
def regressor2(inputs, targets, variable_name):
    """Evaluate the global model `regr` on one year and scatter-plot it.

    Parameters
    ----------
    inputs : array of shape (n_features, n_samples)
    targets : 1-D array of true values
    variable_name : title string for the scatter plot

    Returns (r, rms, m): correlation coefficient, RMSE, and the
    best-fit slope reported by scatter_plot.
    """
    features = inputs.transpose()
    predictions = regr.predict(features)
    m = scatter_plot(targets, predictions, variable_name)
    r = np.round(np.corrcoef(targets, predictions)[0][1], 3)
    rms = rmse(targets, predictions)
    return (r, rms, m)
Regressor 3¶
In [ ]:
def regressor3(inputs, targets):
    """Evaluate the global model `regr` on one day, without plotting.

    Parameters
    ----------
    inputs : array of shape (n_features, n_samples)
    targets : 1-D array of true values

    Returns (r, rms, m): correlation coefficient, RMSE, and the slope
    of the least-squares line through (targets, predictions).
    """
    features = inputs.transpose()
    predictions = regr.predict(features)
    # least-squares fit: slope m and intercept b
    m, b = np.polyfit(targets, predictions, deg=1)
    r = np.round(np.corrcoef(targets, predictions)[0][1], 3)
    rms = rmse(targets, predictions)
    return (r, rms, m)
Regressor 4¶
In [ ]:
def regressor4(inputs, targets, variable_name):
    """Predict a full daily map with the global model `regr` and plot it.

    Relies on the globals regr, indx, diat_i, dates and i set by the
    calling loop. Flat predictions are scattered back onto the (y, x)
    grid, leaving NaN in cells that were masked out.
    """
    features = inputs.transpose()
    predictions = regr.predict(features)
    # Post processing: rebuild the 2-D field from the valid-cell predictions
    flat_map = np.full((len(diat_i.y)*len(diat_i.x)), np.nan)
    flat_map[indx[0]] = predictions
    model = np.reshape(flat_map, (len(diat_i.y), len(diat_i.x)))
    m = scatter_plot(targets, predictions, variable_name + str(dates[i].date()))
    # Wrap the gridded predictions in a labelled DataArray for plotting
    model = xr.DataArray(model,
                         coords={'y': diat_i.y, 'x': diat_i.x},
                         dims=['y', 'x'],
                         attrs=dict(long_name=variable_name + "Concentration",
                                    units="mmol m-2"),)
    plotting3(targets, model, diat_i, variable_name)
Printing¶
In [ ]:
def printing(targets, outputs, m):
    """Print summary statistics comparing model outputs with targets.

    Parameters
    ----------
    targets : 1-D array of true values
    outputs : 1-D array of predicted values
    m : slope of the best-fit line through (targets, outputs)
    """
    print('The amount of data points is', outputs.size)
    print('The slope of the best fitting line is ', np.round(m, 3))
    print('The correlation coefficient is:', np.round(np.corrcoef(targets, outputs)[0][1], 3))
    # rmse is sklearn's root_mean_squared_error, so label it as RMSE
    # (the old message said "mean square error" and had a stray leading space)
    print('The root mean square error is:', rmse(targets, outputs))
Scatter Plot¶
In [ ]:
def scatter_plot(targets, outputs, variable_name):
    """Draw a scatter plot and a log-scaled 2-D histogram of outputs vs targets.

    Prints summary statistics via printing() and returns the slope m of
    the least-squares line y = m*x + b fitted through the points. Both
    panels share square limits with a 1:1 dashed reference line and the
    fitted line in red.
    """
    # least-squares fit: slope m and intercept b
    m, b = np.polyfit(targets, outputs, deg=1)
    printing(targets, outputs, m)
    fig, (ax_scatter, ax_hist) = plt.subplots(2, figsize=(5, 10), layout='constrained')
    ax_scatter.scatter(targets, outputs, alpha=0.2, s=10)
    # common square limits spanning both axes' data ranges
    lims = [np.min([ax_scatter.get_xlim(), ax_scatter.get_ylim()]),
            np.max([ax_scatter.get_xlim(), ax_scatter.get_ylim()])]
    # fitted line y = m*x + b
    ax_scatter.axline(xy1=(0, b), slope=m, color='r')
    ax_scatter.set_xlabel('targets')
    ax_scatter.set_ylabel('outputs')
    ax_scatter.set_xlim(lims)
    ax_scatter.set_ylim(lims)
    ax_scatter.set_aspect('equal')
    ax_scatter.plot(lims, lims, linestyle='--', color='k')
    h = ax_hist.hist2d(targets, outputs, bins=100, cmap='jet',
                       range=[lims, lims], cmin=0.1, norm='log')
    ax_hist.plot(lims, lims, linestyle='--', color='k')
    # fitted line y = m*x + b
    ax_hist.axline(xy1=(0, b), slope=m, color='r')
    ax_hist.set_xlabel('targets')
    ax_hist.set_ylabel('outputs')
    ax_hist.set_aspect('equal')
    fig.colorbar(h[3], ax=ax_hist, location='bottom')
    fig.suptitle(variable_name)
    plt.show()
    return m
Plotting¶
In [ ]:
def plotting(variable, name):
    """Plot an annual metric against the global `years` as points.

    Parameters
    ----------
    variable : one metric value per year, aligned with `years`
    name : y-axis label
    """
    plt.plot(years, variable, marker='.', linestyle='')
    plt.xlabel('Years')
    plt.ylabel(name)
    plt.show()
Plotting 2¶
In [ ]:
def plotting2(variable, title):
    """Scatter a daily metric over the global `dates`, coloured by month.

    Parameters
    ----------
    variable : one metric value per date in `dates`
    title : inserted into the figure title
    """
    fig, ax = plt.subplots()
    months = pd.DatetimeIndex(dates).month
    scatter = ax.scatter(dates, variable, marker='.', c=months)
    month_handles = scatter.legend_elements()[0]
    ax.legend(handles=month_handles, labels=['February', 'March', 'April'])
    fig.suptitle('Daily ' + title + ' (15 Feb - 30 Apr)')
    fig.show()
Plotting 3¶
In [ ]:
def plotting3(targets, model, variable, variable_name):
    """Map targets, model outputs and their percent difference side by side.

    Parameters
    ----------
    targets : flattened valid-cell target values (used only for the shared
        colour range of the two concentration panels)
    model : xr.DataArray of predictions on the (y, x) grid
    variable : xr.DataArray of the target field on the (y, x) grid
    variable_name : label prefix for titles and colourbars

    Uses the globals `dates` and `i` for the figure title.
    """
    fig, ax = plt.subplots(2, 2, figsize=(10, 15))
    cmap = plt.get_cmap('cubehelix')
    cmap.set_bad('gray')  # masked (NaN) cells shown in gray
    variable.plot(ax=ax[0, 0], cmap=cmap, vmin=targets.min(), vmax=targets.max(), cbar_kwargs={'label': variable_name + ' Concentration [mmol m-2]'})
    model.plot(ax=ax[0, 1], cmap=cmap, vmin=targets.min(), vmax=targets.max(), cbar_kwargs={'label': variable_name + ' Concentration [mmol m-2]'})
    ((variable - model) / variable * 100).plot(ax=ax[1, 0], cmap=cmap, cbar_kwargs={'label': variable_name + ' Concentration [percentage]'})
    plt.subplots_adjust(left=0.1,
                        bottom=0.1,
                        right=0.95,
                        top=0.95,
                        wspace=0.35,
                        hspace=0.35)
    sa_vi.set_aspect(ax[0, 0])
    sa_vi.set_aspect(ax[0, 1])
    sa_vi.set_aspect(ax[1, 0])
    ax[0, 0].title.set_text(variable_name + ' (targets)')
    ax[0, 1].title.set_text(variable_name + ' (outputs)')
    # The lower-left panel is a percent difference, not a raw difference;
    # the old title 'targets - outputs' misdescribed it.
    ax[1, 0].title.set_text('(targets - outputs) / targets [%]')
    ax[1, 1].axis('off')
    fig.suptitle(str(dates[i].date()))
    plt.show()
Training (Random Points)¶
In [ ]:
# Load the integrated model variables (temperature, salinity, diatoms)
# and the external forcing inputs (solar radiation, wind speed).
ds = xr.open_dataset('/data/ibougoudis/MOAD/files/integrated_model_var_old.nc')
ds2 = xr.open_dataset('/data/ibougoudis/MOAD/files/external_inputs.nc')
# Optional coarse subsampling (every 2nd day, every 5th grid point) for
# quick test runs — left disabled for the full-resolution training.
# ds = ds.isel(time_counter = (np.arange(0, len(ds.time_counter),2)),
# y=(np.arange(ds.y[0], ds.y[-1], 5)),
# x=(np.arange(ds.x[0], ds.x[-1], 5)))
# ds2 = ds2.isel(time_counter = (np.arange(0, len(ds2.time_counter),2)),
# y=(np.arange(ds2.y[0], ds2.y[-1], 5)),
# x=(np.arange(ds2.x[0], ds2.x[-1], 5)))
# Dates of all time steps; reused by the plotting helpers as a global.
dates = pd.DatetimeIndex(ds['time_counter'].values)
# Train on random points drawn from the whole record.
drivers, diat, _ = datasets_preparation(ds, ds2)
regr = regressor(drivers, diat)
Other Years (Annually)¶
In [ ]:
# Evaluate the trained model separately on each year of the record.
years = range(2007, 2024)
r_all = []
rms_all = []
slope_all = []
# Iterate over `years` directly instead of re-spelling range(2007, 2024),
# so the loop and the x-axis used by plotting() cannot drift apart.
for year in tqdm(years):
    dataset = ds.sel(time_counter=str(year))
    dataset2 = ds2.sel(time_counter=str(year))
    drivers, diat, _ = datasets_preparation(dataset, dataset2)
    r, rms, m = regressor2(drivers, diat, 'Diatom ' + str(year))
    r_all.append(r)
    rms_all.append(rms)
    slope_all.append(m)
plotting(np.transpose(r_all), 'Correlation Coefficient')
plotting(np.transpose(rms_all), 'Root Mean Square Error')
plotting(np.transpose(slope_all), 'Slope of the best fitting line')
0%| | 0/17 [00:00<?, ?it/s]/tmp/ipykernel_25358/1980467486.py:4: RankWarning: Polyfit may be poorly conditioned m, b = np.polyfit(targets, outputs, deg=1)
The amount of data points is 3485925 The slope of the best fitting line is 0.497 The correlation coefficient is: 0.977 The mean square error is: 0.03490124707966972
6%|▌ | 1/17 [00:43<11:40, 43.76s/it]/tmp/ipykernel_25358/1980467486.py:4: RankWarning: Polyfit may be poorly conditioned m, b = np.polyfit(targets, outputs, deg=1)
The amount of data points is 3532404 The slope of the best fitting line is 0.494 The correlation coefficient is: 0.965 The mean square error is: 0.038075280649577825
12%|█▏ | 2/17 [01:32<11:42, 46.84s/it]/tmp/ipykernel_25358/1980467486.py:4: RankWarning: Polyfit may be poorly conditioned m, b = np.polyfit(targets, outputs, deg=1)
The amount of data points is 3485925 The slope of the best fitting line is 0.492 The correlation coefficient is: 0.98 The mean square error is: 0.03971866669458346
18%|█▊ | 3/17 [02:13<10:13, 43.85s/it]/tmp/ipykernel_25358/1980467486.py:4: RankWarning: Polyfit may be poorly conditioned m, b = np.polyfit(targets, outputs, deg=1)
The amount of data points is 3485925 The slope of the best fitting line is 0.499 The correlation coefficient is: 0.977 The mean square error is: 0.03129580954597224
24%|██▎ | 4/17 [02:53<09:10, 42.33s/it]/tmp/ipykernel_25358/1980467486.py:4: RankWarning: Polyfit may be poorly conditioned m, b = np.polyfit(targets, outputs, deg=1)
The amount of data points is 3485925 The slope of the best fitting line is 0.5 The correlation coefficient is: 0.98 The mean square error is: 0.031358808976161265
29%|██▉ | 5/17 [03:34<08:22, 41.92s/it]/tmp/ipykernel_25358/1980467486.py:4: RankWarning: Polyfit may be poorly conditioned m, b = np.polyfit(targets, outputs, deg=1)
The amount of data points is 3532404 The slope of the best fitting line is 0.498 The correlation coefficient is: 0.978 The mean square error is: 0.03337135030295379
35%|███▌ | 6/17 [04:14<07:34, 41.30s/it]/tmp/ipykernel_25358/1980467486.py:4: RankWarning: Polyfit may be poorly conditioned m, b = np.polyfit(targets, outputs, deg=1)
The amount of data points is 3485925 The slope of the best fitting line is 0.499 The correlation coefficient is: 0.976 The mean square error is: 0.03942226122754738
41%|████ | 7/17 [04:53<06:44, 40.49s/it]/tmp/ipykernel_25358/1980467486.py:4: RankWarning: Polyfit may be poorly conditioned m, b = np.polyfit(targets, outputs, deg=1)
The amount of data points is 3485925 The slope of the best fitting line is 0.499 The correlation coefficient is: 0.971 The mean square error is: 0.03470721681990617
47%|████▋ | 8/17 [05:32<06:00, 40.09s/it]/tmp/ipykernel_25358/1980467486.py:4: RankWarning: Polyfit may be poorly conditioned m, b = np.polyfit(targets, outputs, deg=1)
The amount of data points is 3485925 The slope of the best fitting line is 0.498 The correlation coefficient is: 0.983 The mean square error is: 0.028094215093718407
53%|█████▎ | 9/17 [06:12<05:20, 40.01s/it]/tmp/ipykernel_25358/1980467486.py:4: RankWarning: Polyfit may be poorly conditioned m, b = np.polyfit(targets, outputs, deg=1)
The amount of data points is 3532404 The slope of the best fitting line is 0.498 The correlation coefficient is: 0.987 The mean square error is: 0.027744103586922458
59%|█████▉ | 10/17 [06:49<04:33, 39.12s/it]/tmp/ipykernel_25358/1980467486.py:4: RankWarning: Polyfit may be poorly conditioned m, b = np.polyfit(targets, outputs, deg=1)
The amount of data points is 3485925 The slope of the best fitting line is 0.497 The correlation coefficient is: 0.975 The mean square error is: 0.029849395828214702
65%|██████▍ | 11/17 [07:29<03:56, 39.34s/it]/tmp/ipykernel_25358/1980467486.py:4: RankWarning: Polyfit may be poorly conditioned m, b = np.polyfit(targets, outputs, deg=1)
The amount of data points is 3485925 The slope of the best fitting line is 0.493 The correlation coefficient is: 0.971 The mean square error is: 0.03751021885163527
71%|███████ | 12/17 [08:06<03:13, 38.78s/it]/tmp/ipykernel_25358/1980467486.py:4: RankWarning: Polyfit may be poorly conditioned m, b = np.polyfit(targets, outputs, deg=1)
The amount of data points is 3485925 The slope of the best fitting line is 0.494 The correlation coefficient is: 0.972 The mean square error is: 0.04162589518162528
76%|███████▋ | 13/17 [08:44<02:34, 38.54s/it]/tmp/ipykernel_25358/1980467486.py:4: RankWarning: Polyfit may be poorly conditioned m, b = np.polyfit(targets, outputs, deg=1)
The amount of data points is 3532404 The slope of the best fitting line is 0.49 The correlation coefficient is: 0.975 The mean square error is: 0.045365657283306424
82%|████████▏ | 14/17 [09:23<01:55, 38.52s/it]/tmp/ipykernel_25358/1980467486.py:4: RankWarning: Polyfit may be poorly conditioned m, b = np.polyfit(targets, outputs, deg=1)
The amount of data points is 3485925 The slope of the best fitting line is 0.494 The correlation coefficient is: 0.981 The mean square error is: 0.03440497182532782
88%|████████▊ | 15/17 [10:03<01:18, 39.00s/it]/tmp/ipykernel_25358/1980467486.py:4: RankWarning: Polyfit may be poorly conditioned m, b = np.polyfit(targets, outputs, deg=1)
The amount of data points is 3485925 The slope of the best fitting line is 0.499 The correlation coefficient is: 0.977 The mean square error is: 0.03157936497975302
94%|█████████▍| 16/17 [10:44<00:39, 39.55s/it]/tmp/ipykernel_25358/1980467486.py:4: RankWarning: Polyfit may be poorly conditioned m, b = np.polyfit(targets, outputs, deg=1)
The amount of data points is 3485925 The slope of the best fitting line is 0.498 The correlation coefficient is: 0.972 The mean square error is: 0.03898835566089332
100%|██████████| 17/17 [11:25<00:00, 40.30s/it]
Other Years (Daily)¶
In [ ]:
# Evaluate the model day by day over the whole record.
# Accumulate in Python lists and convert once at the end: np.append
# reallocates and copies the whole array on every iteration, which is
# O(n^2) over the ~1279 time steps.
r_all2 = []
rms_all2 = []
slope_all2 = []
for i in tqdm(range(0, len(ds.time_counter))):
    dataset = ds.isel(time_counter=i)
    dataset2 = ds2.isel(time_counter=i)
    drivers, diat, _ = datasets_preparation(dataset, dataset2)
    r, rms, m = regressor3(drivers, diat)
    r_all2.append(r)
    rms_all2.append(rms)
    slope_all2.append(m)
r_all2 = np.array(r_all2)
rms_all2 = np.array(rms_all2)
slope_all2 = np.array(slope_all2)
plotting2(r_all2, 'Correlation Coefficients')
plotting2(rms_all2, 'Root Mean Square Errors')
plotting2(slope_all2, 'Slope of the best fitting line')
100%|██████████| 1279/1279 [11:40:36<00:00, 32.87s/it]
Daily Maps¶
In [ ]:
# Pick 10 random days and draw a full prediction map for each.
# regressor4 reads the loop variables i, indx and diat_i as globals.
maps = random.sample(range(0,len(ds.time_counter)),10)
for i in tqdm(maps):
    dataset = ds.isel(time_counter=i)
    dataset2 = ds2.isel(time_counter=i)
    drivers, diat, indx = datasets_preparation(dataset, dataset2)
    # target field on the (y, x) grid for this day
    diat_i = dataset['Diatom']
    regressor4(drivers, diat, 'Diatom ')
0%| | 0/10 [00:00<?, ?it/s]
The amount of data points is 46479 The slope of the best fitting line is 0.861 The correlation coefficient is: 0.945 The mean square error is: 0.042138232345826206
10%|█ | 1/10 [00:35<05:19, 35.50s/it]
The amount of data points is 46479 The slope of the best fitting line is 0.869 The correlation coefficient is: 0.958 The mean square error is: 0.05927678601072566
20%|██ | 2/10 [01:09<04:38, 34.78s/it]
The amount of data points is 46479 The slope of the best fitting line is 0.938 The correlation coefficient is: 0.966 The mean square error is: 0.04157926302484591
30%|███ | 3/10 [01:41<03:52, 33.27s/it]
The amount of data points is 46479 The slope of the best fitting line is 0.893 The correlation coefficient is: 0.948 The mean square error is: 0.021576014985365415
40%|████ | 4/10 [02:12<03:14, 32.38s/it]
The amount of data points is 46479 The slope of the best fitting line is 0.972 The correlation coefficient is: 0.968 The mean square error is: 0.013524567365046338
50%|█████ | 5/10 [02:43<02:39, 31.84s/it]
The amount of data points is 46479 The slope of the best fitting line is 0.902 The correlation coefficient is: 0.933 The mean square error is: 0.0203533569691006
60%|██████ | 6/10 [03:15<02:08, 32.08s/it]
The amount of data points is 46479 The slope of the best fitting line is 0.874 The correlation coefficient is: 0.879 The mean square error is: 0.02558484523511505
70%|███████ | 7/10 [03:45<01:34, 31.43s/it]
The amount of data points is 46479 The slope of the best fitting line is 0.93 The correlation coefficient is: 0.959 The mean square error is: 0.034261297471123386
80%|████████ | 8/10 [04:17<01:02, 31.38s/it]
The amount of data points is 46479 The slope of the best fitting line is 0.938 The correlation coefficient is: 0.967 The mean square error is: 0.01607637032995785
90%|█████████ | 9/10 [04:47<00:31, 31.07s/it]
The amount of data points is 46479 The slope of the best fitting line is 0.907 The correlation coefficient is: 0.958 The mean square error is: 0.018836434030312137
100%|██████████| 10/10 [05:18<00:00, 31.84s/it] 100%|██████████| 10/10 [05:18<00:00, 31.84s/it]
In [ ]: